Finding the optimal number of clusters

Compare mean rms error against compression ratio and time to solve, for different numbers of clusters K and different values of M, to find a good operating point.

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%pylab inline
pd.__version__ # need pandas >= 0.14.0 for MultiIndex slicing with .loc(axis=0)


Populating the interactive namespace from numpy and matplotlib
Out[12]:
'0.14.1'

Read files


In [13]:
# all variables, K = 10...200, M = 10...200, vertical stacking only; compression ratio in this file is wrong (recomputed below)
ol = pd.read_table("overall_statistics_klarge.txt").set_index(["K","M","STATISTIC"])["VALUE"].unstack()
vl = pd.read_table("variable_statistics_klarge.txt").set_index(["K","M","STATISTIC","VARIABLE"])["VALUE"].unstack().unstack()

In [14]:
# all variables, K = 1...10, M = 10...200, vertical stacking only; compression ratio in this file is wrong (recomputed below)
os = pd.read_table("overall_statistics_ksmall.txt").set_index(["K","M","STATISTIC"])["VALUE"].unstack()
vs = pd.read_table("variable_statistics_ksmall.txt").set_index(["K","M","STATISTIC","VARIABLE"])["VALUE"].unstack().unstack()

In [15]:
# 3D variables only, K = 6...15, M = 160...250, vertical stacking; compression ratio in this file is correct
o3d = pd.read_table("overall_statistics_3d.txt").set_index(["K","M","STATISTIC"])["VALUE"].unstack()
v3d = pd.read_table("variable_statistics_3d.txt").set_index(["K","M","STATISTIC","VARIABLE"])["VALUE"].unstack().unstack()
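
The statistics files are long-format tables (one VALUE per row, keyed by K, M, STATISTIC, plus VARIABLE in the per-variable files); set_index(...)["VALUE"].unstack() pivots them into one column per statistic. A minimal sketch of the reshape, with made-up numbers rather than the actual files:

In [ ]:
# Synthetic illustration of the reshape used above (values are made up)
demo = pd.DataFrame({"K": [10, 10, 20, 20],
                     "M": [10, 10, 10, 10],
                     "STATISTIC": ["rms_error", "time_solve"] * 2,
                     "VALUE": [0.003, 450.0, 0.002, 520.0]})
demo.set_index(["K", "M", "STATISTIC"])["VALUE"].unstack()
# -> one row per (K, M), one column per STATISTIC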

Add fixed compression ratios


In [16]:
N_c = 3008  # for all variables, vertical stacking
N_d = 48602 # for all variables, vertical stacking
original_size = N_c * N_d  # number of values in the uncompressed data
compressed_size = lambda K, M: N_d + N_c * K + N_d * M + N_c * K * M
for frame in (ol, os):  # the 3D results already carry a correct compression ratio
    K_vals = np.array(frame.index.get_level_values("K"))
    M_vals = np.array(frame.index.get_level_values("M"))
    frame["compression_ratio_fixed"] = compressed_size(K_vals, M_vals) / original_size

Error vs compression ratio


In [17]:
# K large (sliced to K = 10...100)
grouped_data = (vl.loc(axis=0)[10:100,:].mean(axis=1,level="STATISTIC")
                .join(ol).reset_index().groupby("K"))
for key,grp in grouped_data:
    plt.plot(grp["compression_ratio_fixed"],grp["rms_error"],label="K = " + str(key))
plt.legend()
plt.xlabel("compression ratio")
plt.ylabel("mean rms error")
plt.title("error vs compression ratio, by K")


Out[17]:
<matplotlib.text.Text at 0x7fd7e8396c18>

In [18]:
# K small (K = 1...10)
grouped_data = (vs.loc(axis=0)[:,:].mean(axis=1,level="STATISTIC")
                .join(os).reset_index().groupby("K"))
for key,grp in grouped_data:
    plt.plot(grp["compression_ratio_fixed"],grp["rms_error"],label="K = " + str(key))
plt.legend()
plt.xlabel("compression ratio")
plt.ylabel("mean rms error")
plt.title("error vs compression ratio, by K")


Out[18]:
<matplotlib.text.Text at 0x7fd7e8257da0>

In [19]:
# K small, zoomed (K = 5...10)
grouped_data = (vs.loc(axis=0)[5:10,:].mean(axis=1,level="STATISTIC")
                .join(os).reset_index().groupby("K"))
for key,grp in grouped_data:
    plt.plot(grp["compression_ratio_fixed"],grp["rms_error"],label="K = " + str(key))
plt.legend()
plt.xlabel("compression ratio")
plt.ylabel("mean rms error")
plt.title("error vs compression ratio, by K")
plt.xlim((0.05,0.07))
plt.ylim((0.002,0.0035))
#plt.xlim((0.08,0.1))
#plt.ylim((0.0013,0.002))


Out[19]:
(0.002, 0.0035)
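
One way to read these curves numerically (a sketch, not part of the original run; the 0.07 compression-ratio budget is an arbitrary assumption): for each K, take the best mean rms error reachable within the budget.

In [ ]:
# Sketch (not in the original run): best mean rms error per K within an
# assumed compression-ratio budget of 0.07
budget = 0.07
summary = (vs.loc(axis=0)[5:10,:].mean(axis=1,level="STATISTIC")
           .join(os).reset_index())
summary[summary["compression_ratio_fixed"] <= budget].groupby("K")["rms_error"].min()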

In [20]:
# K = 8...14, 3D data only
grouped_data = (v3d.loc(axis=0)[8:14,:].mean(axis=1,level="STATISTIC")
                .join(o3d).reset_index().groupby("K"))
for key,grp in grouped_data:
    plt.plot(grp["compression_ratio"],grp["rms_error"],label="K = " + str(key))
plt.legend()
plt.xlabel("compression ratio")
plt.ylabel("mean rms error")
plt.title("error vs compression ratio, by K (3D only)")
plt.xlim((0.11,0.13))
plt.ylim((0.001,0.00115))


Out[20]:
(0.001, 0.00115)

Time vs error


In [21]:
# K small, zoomed (K = 5...10)
grouped_data = (vs.loc(axis=0)[5:10,:].mean(axis=1,level="STATISTIC")
                .join(os).reset_index().groupby("K"))
for key,grp in grouped_data:
    plt.plot(grp["time_solve"],grp["rms_error"],label="K = " + str(key))
plt.legend()
plt.xlabel("time to solve [s]")
plt.ylabel("mean rms error")
plt.title("error vs time to solve, by K")
plt.xlim((300,600))
plt.ylim((0.002,0.0035))
#plt.xlim((0.08,0.1))
#plt.ylim((0.0013,0.002))


Out[21]:
(0.002, 0.0035)

In [22]:
# K = 8...14, 3D data only
grouped_data = (v3d.loc(axis=0)[8:14,:].mean(axis=1,level="STATISTIC")
                .join(o3d).reset_index().groupby("K"))
for key,grp in grouped_data:
    plt.plot(grp["time_solve"],grp["rms_error"],label="K = " + str(key))
plt.legend()
plt.xlabel("time to solve [s]")
plt.ylabel("mean rms error")
plt.title("error vs time to solve, by K (3D only)")
#plt.xlim((800,1100))
#plt.ylim((0.001,0.00115))


Out[22]:
<matplotlib.text.Text at 0x7fd7e81821d0>
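
Similarly (a sketch, not part of the original run; the 0.0011 error target is an arbitrary assumption), the fastest solve per K that stays below a target mean rms error can be tabulated for the 3D data:

In [ ]:
# Sketch (not in the original run): fastest solve per K below an assumed
# rms-error target of 0.0011, 3D data only
target = 0.0011
summary = (v3d.loc(axis=0)[8:14,:].mean(axis=1,level="STATISTIC")
           .join(o3d).reset_index())
summary[summary["rms_error"] <= target].groupby("K")["time_solve"].min()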